{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Современные языковые модели на примере GPT-2 и как их применять в диалоговых системах."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
      "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
      "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
      "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
      "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
      "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "from transformers import GPT2LMHeadModel, GPT2Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "weights_shortcut = 'gpt2'\n",
    "\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(weights_shortcut)\n",
    "model = GPT2LMHeadModel.from_pretrained(weights_shortcut)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt_text = 'My name is'\n",
    "encoded_prompt = tokenizer.encode(prompt_text, return_tensors=\"pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[3666, 1438,  318]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoded_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
    "model.to(device)\n",
    "encoded_prompt = encoded_prompt.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "My name is John. I'm a man of God. I'm a man of God.\n"
     ]
    }
   ],
   "source": [
    "encoded_result = model.generate(encoded_prompt, \n",
    "                                eos_token_ids=tokenizer.eos_token_id\n",
    "                               )\n",
    "result = tokenizer.decode(encoded_result[0], skip_special_tokens=True)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dataset is preprocessed from here: https://github.com/square/MimicAndRephrase/tree/master/datasets/Sentiment/Sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import DataLoader\n",
    "\n",
    "def get_dataset_tensor(dataset_path):\n",
    "    with open(dataset_path) as f:\n",
    "        tokenized_dataset = [tokenizer.encode(line) for line in f]\n",
    "\n",
    "    samples_num = len(tokenized_dataset)\n",
    "    max_tokens_num = max(map(len, tokenized_dataset))\n",
    "\n",
    "    input_ids = np.full((samples_num, max_tokens_num), tokenizer.pad_token_id, dtype=np.int64)\n",
    "    for i, tokens in enumerate(tokenized_dataset):\n",
    "        input_ids[i, :len(tokens)] = tokens\n",
    "\n",
    "    return torch.from_numpy(input_ids)\n",
    "\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "train_data_tensor = get_dataset_tensor(dataset_path='paraphrase_dataset.txt')\n",
    "train_dataloader = DataLoader(train_data_tensor, batch_size=16, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AdamW, get_linear_schedule_with_warmup\n",
    "\n",
    "def train_model(model, training_data, epochs_num):\n",
    "    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=1)\n",
    "\n",
    "    train_loss = []\n",
    "\n",
    "    for _ in tqdm(range(epochs_num), total=epochs_num):\n",
    "        for input_ids in training_data:\n",
    "            model.train()\n",
    "\n",
    "            input_ids = input_ids.to(device)\n",
    "            loss = model(input_ids, labels=input_ids)[0]\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            optimizer.zero_grad()\n",
    "\n",
    "        train_loss.append(loss.item())\n",
    "                \n",
    "    return model, train_loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Jack lost my keys -> I am sorry to hear your laptop accidentally lost something!\n"
     ]
    }
   ],
   "source": [
    "encoded_prompt = tokenizer.encode('Jack lost my keys -> ', return_tensors=\"pt\").to(device)\n",
    "encoded_result = model.generate(encoded_prompt, \n",
    "                                eos_token_ids=tokenizer.eos_token_id,\n",
    "                                do_sample=True)\n",
    "result = tokenizer.decode(encoded_result[0], skip_special_tokens=True)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "490c016d6b5346a3bf9c3fa5967e26ca",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "finetuned_model, metrics_history = train_model(model, train_dataloader, epochs_num=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I bought a new  car -> I am glad you bought a new car!!\n",
      "I bought a new  car -> I am glad you bought a new car!!\n",
      "I bought a new  car -> I am glad to hear about your new car!\n",
      "I bought a new  car -> I am glad you bought a new car!!\n",
      "I bought a new  car -> I am glad you bought a new car!!\n"
     ]
    }
   ],
   "source": [
    "encoded_prompt = tokenizer.encode('I bought a new car -> ', return_tensors=\"pt\").to(device)\n",
    "encoded_result = finetuned_model.generate(encoded_prompt, \n",
    "                                          eos_token_ids=tokenizer.eos_token_id,\n",
    "                                          num_return_sequences=5)\n",
    "for cur_sample_tokens in encoded_result[0]:\n",
    "    print(tokenizer.decode(cur_sample_tokens, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Next steps\n",
    "* Compute validation metrics: perplexity/BLEU/ROUGE\n",
    "* Logging into tensorboard\n",
    "* Generate N candidates and filter or rerank\n",
    "* Analyze errors and improve dataset\n",
    "* Improve training: masking, `lr_scheduler`, multi-gpu training\n",
    "* Improve generation: try different strategies\n",
    "* Improve the model: use bigger model, try different architecrures (DialoGPT2, XLNet, CTRL  etc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
